Covid-19 dataset - object from Rik and running celltypist

Import packages

In [3]:
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.api as sc
import scrublet as scr
import seaborn as sns
import scipy.stats
import anndata
import os

import scipy as scipy
import scipy as sp
import pickle as pkl
import matplotlib.pyplot as plt
import re
from collections import defaultdict
from statsmodels.nonparametric.smoothers_lowess import lowess
from numpy import asarray as ar
from collections import Counter
import networkx as nx
import igraph
import glob

import sys
sys.path.append("/mnt/00_Tools/")
from jp_single import *
from bbknn import bbknn

os.getcwd()

sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()

sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.logging.print_version_and_date()
%load_ext autoreload
%autoreload 2

result_file = './write/adata_PIP-B_T-all.h5ad'
scanpy==1.4.5.post3 anndata==0.6.22.post1 umap==0.4.6 numpy==1.19.2 scipy==1.5.2 pandas==1.1.3 scikit-learn==0.23.1 statsmodels==0.11.1 python-igraph==0.7.1 louvain==0.6.1 leidenalg==0.7.0
Running Scanpy 1.4.5.post3, on 2020-10-06 12:32.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
In [4]:
sys.path.append("/mnt/thymus_atlas")
import scjp

Reading samples

The R object was converted to .h5ad format using the sceasy package: https://github.com/cellgeni/sceasy

In [5]:
adata = sc.read('/mnt/COVID-19/cpm.h5ad')
In [6]:
adata.uns
Out[6]:
OverloadedDict, wrapping:
	OrderedDict()
With overloaded keys:
	['neighbors'].
In [7]:
adata.obs.columns
Out[7]:
Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'RNA_snn_res.1',
       'seurat_clusters', 'cell.type', 'predictedCluster',
       'seurat_clusters_numeric', 'manual_celltype', 'cellType', 'barcode',
       'status', 'assignment', 'log_prob_singleton', 'log_prob_doublet',
       'cluster0', 'cluster1', 'originalSample', 'ID', 'ageGroup',
       'idAndBatch'],
      dtype='object')
In [8]:
adata.obs.head(1)
Out[8]:
orig.ident nCount_RNA nFeature_RNA RNA_snn_res.1 seurat_clusters cell.type predictedCluster seurat_clusters_numeric manual_celltype cellType ... status assignment log_prob_singleton log_prob_doublet cluster0 cluster1 originalSample ID ageGroup idAndBatch
S1_AAACCTGAGACAGACC-1 S1 2149.714502 1142 2 2 B cells 2: B cells (7494 0.45) 2.0 B Cells B Cells 2. B Cells ... singlet 1 -329.161474 -477.674388 -1012.228842 -329.161474 CV001_KM9166541-CV001_KM9166568 NP22 Infant NP22_S1

1 rows × 21 columns

In [9]:
sc.pl.umap(adata, color='manual_celltype')
/home/ubuntu/.local/lib/python3.6/site-packages/anndata/_core/anndata.py:1192: FutureWarning:

is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead

In [10]:
sc.pl.umap(adata, color='predictedCluster')
In [11]:
adata.obsm
Out[11]:
AxisArrays with keys: X_harmony, X_pca, X_umap, X_umapafterharmony_adt

Rik has saved several layers of the data but recommends to use the harmony corrected layer

In [12]:
adata.obsm['X_umap'] = adata.obsm['X_umapafterharmony_adt']
In [13]:
sc.pl.umap(adata, color='predictedCluster')

allData_highest

In [14]:
lr = pkl.load(open('/mnt/PIP/celltypist/manual_models/Immune_v5_allData_highest_all.pkl','rb'))
/home/ubuntu/.local/lib/python3.6/site-packages/sklearn/utils/deprecation.py:143: FutureWarning:

The sklearn.linear_model.stochastic_gradient module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.linear_model. Anything that cannot be imported from sklearn.linear_model is now part of the private API.

/home/ubuntu/.local/lib/python3.6/site-packages/sklearn/utils/deprecation.py:143: FutureWarning:

The sklearn.linear_model.sgd_fast module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.linear_model. Anything that cannot be imported from sklearn.linear_model is now part of the private API.

/home/ubuntu/.local/lib/python3.6/site-packages/sklearn/base.py:334: UserWarning:

Trying to unpickle estimator SGDClassifier from version 0.20.3 when using version 0.23.1. This might lead to breaking code or invalid results. Use at your own risk.

In [15]:
CT_genes = pd.read_csv('/mnt/PIP/celltypist/manual_models/HumanAtlas_inters_ptprc_plus_genes.csv', header=None)
In [16]:
CT_genes.head(1)
Out[16]:
0
0 NOC2L
In [17]:
CT_genes['idx'] = CT_genes.index
In [18]:
CT_genes.head(1)
Out[18]:
0 idx
0 NOC2L 0
In [19]:
CT_genes.columns = ['symbol', 'idx']
In [20]:
CT_genes = np.array(CT_genes['symbol'])
In [21]:
lr['Model'].features = CT_genes
In [22]:
lr = lr['Model']
In [23]:
#features is the gene names of the dataset to be annotated

features = adata.var_names
In [24]:
lr.features
Out[24]:
array(['NOC2L', 'ISG15', 'C1orf159', ..., 'STX18-AS1', 'AC027290.2',
       'AC116655.1'], dtype=object)
In [25]:
k_x = features.isin(list(lr.features))
print(f'{k_x.sum()} features used for prediction', file=sys.stderr)
14561 features used for prediction
In [26]:
k_x = features.isin(list(lr.features))
print(f'{k_x.sum()} features used for prediction', file=sys.stderr)
k_x_idx = np.where(k_x)[0]
X = adata.X[:, k_x_idx]
features = features[k_x]

ad_ft = pd.DataFrame(features.values, columns=['ad_features']).reset_index().rename(columns={'index': 'ad_idx'})
lr_ft = pd.DataFrame(lr.features, columns=['lr_features']).reset_index().rename(columns={'index': 'lr_idx'})
lr_idx = lr_ft.merge(ad_ft, left_on='lr_features', right_on='ad_features').sort_values(by='ad_idx').lr_idx.values

lr.n_features_in_ = lr_idx.size
lr.features = lr.features[lr_idx]
lr.coef_ = lr.coef_[:, lr_idx]
14561 features used for prediction
In [27]:
predicted_hi = lr.predict(X)
In [28]:
predicted_hi.shape
Out[28]:
(81912,)
In [29]:
adata.obs['predicted_hi'] = predicted_hi
In [30]:
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.pl.umap(adata, color = 'predicted_hi')
/home/ubuntu/.local/lib/python3.6/site-packages/anndata/_core/anndata.py:1192: FutureWarning:

is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead

... storing 'predicted_hi' as categorical
In [32]:
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.pl.umap(adata, color = 'predicted_hi', groups=['B cells','ILC','Monocytes', 'T cells', 'DC'])
In [33]:
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.pl.umap(adata, color = 'predicted_hi')
In [34]:
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.pl.umap(adata, color = 'manual_celltype')

allData_lowest

In [35]:
lr = pkl.load(open('/mnt/PIP/celltypist/manual_models/Immune_v5_allData_lowest_all.pkl','rb'))
/home/ubuntu/.local/lib/python3.6/site-packages/sklearn/base.py:334: UserWarning:

Trying to unpickle estimator SGDClassifier from version 0.20.3 when using version 0.23.1. This might lead to breaking code or invalid results. Use at your own risk.

In [36]:
CT_genes = pd.read_csv('/mnt/PIP/celltypist/manual_models/HumanAtlas_inters_ptprc_plus_genes.csv', header=None)
In [37]:
CT_genes.head(1)
Out[37]:
0
0 NOC2L
In [38]:
CT_genes['idx'] = CT_genes.index
In [39]:
CT_genes.head(1)
Out[39]:
0 idx
0 NOC2L 0
In [40]:
CT_genes.columns = ['symbol', 'idx']
In [41]:
CT_genes = np.array(CT_genes['symbol'])
In [42]:
lr['Model'].features = CT_genes
In [43]:
lr = lr['Model']
In [44]:
#features is the gene names of the dataset to be annotated

features = adata.var_names
In [45]:
lr.features
Out[45]:
array(['NOC2L', 'ISG15', 'C1orf159', ..., 'STX18-AS1', 'AC027290.2',
       'AC116655.1'], dtype=object)
In [46]:
k_x = features.isin(list(lr.features))
print(f'{k_x.sum()} features used for prediction', file=sys.stderr)
14561 features used for prediction
In [47]:
k_x = features.isin(list(lr.features))
print(f'{k_x.sum()} features used for prediction', file=sys.stderr)
k_x_idx = np.where(k_x)[0]
X = adata.X[:, k_x_idx]
features = features[k_x]

ad_ft = pd.DataFrame(features.values, columns=['ad_features']).reset_index().rename(columns={'index': 'ad_idx'})
lr_ft = pd.DataFrame(lr.features, columns=['lr_features']).reset_index().rename(columns={'index': 'lr_idx'})
lr_idx = lr_ft.merge(ad_ft, left_on='lr_features', right_on='ad_features').sort_values(by='ad_idx').lr_idx.values

lr.n_features_in_ = lr_idx.size
lr.features = lr.features[lr_idx]
lr.coef_ = lr.coef_[:, lr_idx]
14561 features used for prediction
In [48]:
predicted_lo = lr.predict(X)
In [49]:
predicted_lo.shape
Out[49]:
(81912,)
In [50]:
adata.obs['predicted_lo'] = predicted_lo
In [51]:
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.pl.umap(adata, color = 'predicted_lo')
/home/ubuntu/.local/lib/python3.6/site-packages/anndata/_core/anndata.py:1192: FutureWarning:

is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead

... storing 'predicted_lo' as categorical
In [52]:
value_counts = adata.obs['predicted_lo'].value_counts()
In [53]:
type(value_counts)
Out[53]:
pandas.core.series.Series
In [54]:
value_counts = pd.DataFrame(value_counts)
In [55]:
value_counts.head(1)
Out[55]:
predicted_lo
Tcm/Naive helper-T cells 18565
In [56]:
value_counts.columns
Out[56]:
Index(['predicted_lo'], dtype='object')
In [59]:
value_counts_2 = value_counts[~(value_counts['predicted_lo'] <= 100)]  
In [60]:
value_counts_2
Out[60]:
predicted_lo
Tcm/Naive helper-T cells 18565
regulatory-T cells 14400
Monocytes 11016
NK cells 10379
cytotoxic-T cells 9182
Tcm/Naive cytotoxic-T cells 4304
B cells 2483
follicular B cells 2324
Naive B cells 2023
DC2 1913
memory-B cells 1887
T cells 732
Monocyte precursor 325
hcaImmune18BoneMarrowFalse_24 241
hcaImmune18BoneMarrowFalse_28 214
pDC 181
Macrophages 171
Pre-B cells 152
Mid erythroid 118
Granulocytes 107
Treg(diff) 105
CD8a/a 104
In [61]:
ct_predicted = value_counts_2.index
In [73]:
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.pl.umap(adata, color = 'predicted_lo', groups=['Tcm/Naive helper-T cells','Tcm/Naive cytotoxic-T cells','T cells','NK cells','cytotoxic-T cells','memory CD4+ cytotoxic-T cells','follicular B cells','B cells','memory-B cells','ILC','Monocytes',  'DC2'])
In [63]:
ct_predicted = list(ct_predicted)
In [64]:
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.pl.umap(adata, color = 'predicted_lo', groups=ct_predicted)
In [65]:
adata.write('/mnt/COVID-19/covid19.h5ad')
/home/ubuntu/.local/lib/python3.6/site-packages/anndata/_core/anndata.py:1192: FutureWarning:

is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead

Heatmap output

In [77]:
matplotlib.rcParams.update({'figure.figsize': (25, 10)})
matplotlib.rc('xtick', labelsize=8) 
crosstab = pd.crosstab(adata.obs.predicted_hi, adata.obs.predictedCluster).plot(kind='bar', stacked=True)
In [78]:
freq = pd.crosstab(adata.obs.predicted_hi, adata.obs.predictedCluster)
In [79]:
freq = freq.apply(lambda x: 1 * x / x.sum())
In [80]:
rc={'ytick.labelsize': 8}
sns.set(rc=rc)
In [81]:
sns.clustermap(freq, xticklabels=True, yticklabels=True)
Out[81]:
<seaborn.matrix.ClusterGrid at 0x7f6ae3746cf8>
In [82]:
matplotlib.rcParams.update({'figure.figsize': (25, 10)})
matplotlib.rc('xtick', labelsize=8) 
crosstab = pd.crosstab(adata.obs.predicted_lo, adata.obs.predictedCluster).plot(kind='bar', stacked=True)
In [83]:
freq = pd.crosstab(adata.obs.predicted_lo, adata.obs.predictedCluster)
In [84]:
freq = freq.apply(lambda x: 1 * x / x.sum())
In [85]:
rc={'ytick.labelsize': 8}
sns.set(rc=rc)
In [86]:
sns.clustermap(freq, xticklabels=True, yticklabels=True)
Out[86]:
<seaborn.matrix.ClusterGrid at 0x7f6ab66709b0>